###############################################################################
## Data
###############################################################################

## SOEP Data Set
soep <- read.dta("dat/proc-data/soepv35_rents.dta") %>%
  droplevels()

## Inspect missingness by year
soep_missings <- soep %>%
  split(soep$year) %>%
  sapply(function(dat)
    apply(dat, 2, mdesc, mean = TRUE))

## Drop empty variables
soep <- soep %>%
  dplyr::select(-tie_larea,-want_move,-pref_bcty,-pref_mcty,-pref_town,-pref_cntr,-wr_house)

## Drop irrelevant variables
soep <- soep %>%
  dplyr::select(
    -born,-bl,-citizen,-pinc_labor,-log_pinc_labor,-pinc_gross_ttl,-hinc_gross_ttl,-temp,-involpart,-zlwork,-emplrel_r,-emplno_r,-class16_r,-class5_r,-class4,-class6_r,-myclass4,-hh_class,-task,-nace,-starts_with("wsa"),-wttr_job,-cold_rent,-warm_rent,-union
  )
soep <- soep %>%
  dplyr::select(-hh_num_child,-hh_num_ecact,-unemp00,-contact_neighbors)
soep <- soep %>%
  dplyr::select(-stf_larng,-stf_larea)
soep <- soep %>%
  dplyr::select(
    -starts_with("__0"),-moved_area_type,-movein_resmove,-mover_type,-mover_type2,-reason_for_moving
  )

## Interpolate assets
soep <- soep %>%
  group_by(id) %>%
  arrange(year) %>%
  mutate_at(
    .vars = vars(starts_with("asset") & !contains("type")),
    .funs = ~ zoo::na.approx(
      .,
      maxgap = 4L,
      rule = 2,
      na.rm = FALSE
    )
  ) %>%
  mutate_at(
    .vars = vars(starts_with("asset") & contains("type")),
    .funs = ~ zoo::na.locf(
      .,
      maxgap = 4L,
      rule = 2,
      na.rm = FALSE
    )
  ) %>%
  ungroup()

## Factors to numeric
soep <- soep %>%
  mutate_at(.vars = vars(starts_with("stf_"), lr_self),
            .funs = ~ as.numeric(.))

## Person-year ID, filter years before 2005
soep <- soep %>%
  mutate(pyid = paste(id, year, sep = ".")) %>%
  filter(year >= 2005)

## Ownership etc
soep <- soep %>%
  mutate(renter_no_rent = as.numeric(owner == "renter (no rent)")) %>%
  mutate(owner = as.numeric(owner == "owner")) %>%
  droplevels()

## Recode missing category in wr_jbsec
soep <- soep %>%
  mutate(
    wr_jbsec =
      recode_factor(
        wr_jbsec,
        "[-6] Fragebogenversion mit geaenderter Filterfuehrung" = NA_character_
      )
  ) %>%
  droplevels()

## Recode convoluted level names in lm_part
soep <- soep %>%
  mutate(lm_part =
           recode_factor(
             lm_part,
             "Active, Atypical" = "Atypical",
             " (Mostly) inactive" = "Inactive"
           )) %>%
  droplevels()

## Moving and missingness
soep <- soep %>%
  mutate(mover = as.numeric(mover) - 1L) %>%
  mutate(mover = coalesce(mover, 0L))
soep <- soep %>%
  mutate_at(
    .vars = vars(
      starts_with("moved_"),
      starts_with("reason_for_moving"),
      social_housing
    ),
    .funs = ~ ifelse(mover == 0 & is.na(.),-2, .)
  )

## Sample restriction: Only keep individuals who joined the panel pre-2015
soep <- soep %>%
  filter(erstbefr <= 2014)

###############################################################################
## Imputation of SOEP Data
###############################################################################

## Define variables: ID, nominal ordinal, left-bounded (0)
soep_ids <- c(
  "hh_id",
  "pyid",
  "wave",
  "lweight",
  "xweight1",
  "xweight2",
  "xweight3",
  "weight",
  "psample",
  "erstbefr",
  "regbez",
  "ggk",
  "gtyp",
  "bik",
  "gkz",
  "kkz",
  "plz",
  "change",
  "gkzname",
  "ror96",
  "nuts3",
  "nuts2",
  "nuts1",
  "kr_kkz",
  "kr_kkz_rek",
  "kr_kkz_rek90",
  "kr_utmost",
  "kr_utmnord",
  "kr_uemprate",
  "kr_foreigner",
  "kr_emprate",
  "kr_area",
  "kr_hhinc",
  "kr_gdp_pem",
  "kr_gdp_pc",
  "resmove",
  "distance",
  "chg_kkz",
  "chg_zip",
  "plz_lag",
  "kkz_lag",
  "kr_kkz_rek_lag",
  "resmove_full",
  "chg_kkz_full",
  "chg_zip_full",
  "pgexpue",
  "class8_r",
  "warm_rent_load",
  "warm_rent_sqm",
  "selfem_mainjob",
  "task3",
  "hinc",
  "hinc_eq",
  "imputed_rent",
  "yrs_active",
  "age5",
  "prop_rentinc_hinc",
  "pyid",
  "unemp_frac",
  "owner",
  "renter_no_rent",
  names(soep)[grepl("asset", names(soep))],
  "mover",
  "moved_finances",
  "moved_size",
  "moved_furnishings",
  "moved_area",
  "moved_environment",
  "moved_connections",
  "moved_neighborhood",
  "moved_contract",
  "hh_at_current_address",
  "reason_for_moving_costs",
  "reason_for_moving_comfort"
)
soep_nom <- c(
  "hh_comp",
  "myclass4_r",
  "partyid",
  "hh_role",
  "vote",
  "lm_part",
  "fem",
  "migr3",
  "east",
  "social_housing"
)
soep_ord <- c("edu5",
              "wr_jbsec",
              "wr_immig",
              "wr_ecego",
              "wr_ecnat",
              "wr_crime",
              "pol_int")
soep_sqr <- c(
  "log_hinc_eq",
  "yrs_since_movein",
  "cold_rent_sqm",
  "home_size",
  "imputed_rent_per_sqm",
  "hh_mmb",
  "age"
)
soep_con <- c("stf_hinc",
              "stf_dwell",
              "stf_pinc",
              "stf_life",
              "stf_work",
              "lr_self")
soep_lgs <- c("hh_prop_ecact",
              "prop_personal_hinc",
              "hh_prop_child",
              "cold_rent_load")
soep_cs <- "id"
soep_ts <- "year"

## Save indicators of wave missingness
soep_niw <- soep[, grepl("niw_", names(soep))]
names(soep_niw) <-
  substr(names(soep_niw), 5, nchar(names(soep_niw)))
soep_niw <- cbind(soep$year, soep_niw)
names(soep_niw)[1] <- "year"
vars_to_impute <- c(soep_nom,
                    soep_ord,
                    soep_sqr,
                    soep_lgs,
                    soep_con)
soep_niw <- soep_niw %>%
  dplyr::select(c("year",
                  names(soep_niw)[names(soep_niw) %in% vars_to_impute])) %>%
  group_by(year) %>%
  summarize_all(.fun = unique)

## Select relevant variables
vars <- c(soep_nom,
          soep_ord,
          soep_sqr,
          soep_lgs,
          soep_con,
          soep_cs,
          soep_ts,
          soep_ids)
soep <- soep %>%
  dplyr::select(all_of(vars))

## Drop IDs (idvars options return error, we'll add them back later)
soep_to_imp <- soep %>%
  dplyr::select(-all_of(soep_ids))

## Select lead/lag vars
ll_vars <- names(soep_to_imp)[!(
  names(soep_to_imp) %in% c(
    "fem",
    "migr3",
    "east",
    "mover",
    "yrs_since_movein",
    "id",
    "year"
  )
)]

## Proportion bounds
bounds <- matrix(cbind(which(names(soep_to_imp) %in% soep_lgs), 0, 1),
                 nrow = sum(names(soep_to_imp) %in% soep_lgs),
                 ncol = 3L)

## Sequential imputation (Amelia)
soep_imp <- amelia(
  soep_to_imp,
  m = 5,
  noms = soep_nom,
  ords = soep_ord,
  sqrts = soep_sqr,
  bounds = bounds,
  seed = 2021101,
  ts = soep_ts,
  cs = soep_cs,
  autopri = FALSE,
  p2s = 2,
  parallel = "snow",
  ncpus = 5
)

## ----Post - processing----
## Add back unimputed/ID variables
asset_vars <- c(
  "asset_re_hom_p",
  "asset_re_hom_m",
  "asset_re_hom_t",
  "asset_re_oth_p",
  "asset_re_oth_m",
  "asset_re_oth_t",
  "asset_re_ttl_p",
  "asset_re_ttl_m",
  "asset_re_ttl_t",
  "asset_ov_ttl_t",
  "asset_re_type"
)
for (m in seq_along(soep_imp$imputations)) {
  asset_vars_m <- sapply(asset_vars, paste0, m)
  soep_imp$imputations[[m]] <- soep_imp$imputations[[m]] %>%
    left_join(soep %>%
                dplyr::select(id,
                              year,
                              all_of(soep_ids[!(startsWith(soep_ids, "asset"))]),
                              all_of(asset_vars_m)),
              by = c("id", "year"))
}

## Retrieve imputed values from transformed variables
for (m in seq_along(soep_imp$imputations)) {
  soep_imp$imputations[[m]] <- soep_imp$imputations[[m]] %>%
    mutate(
      hinc_eq = exp(log_hinc_eq) - 1,
      log_hinc = log((exp(log_hinc_eq) - 1) * sqrt(hh_mmb)),
      imputed_rent = imputed_rent_per_sqm * home_size,
      age5 = case_when(
        age < 25 ~ "< 25",
        age >= 25 & age <= 34 ~ "25-34",
        age >= 35 & age <= 44 ~ "35-44",
        age >= 45 & age <= 54 ~ "45-54",
        age >= 55 ~ ">=55",
        TRUE ~ NA_character_
      )
    ) %>%
    mutate_at(.vars = vars(starts_with("stf_")),
              .funs = ~ (. - 1)) %>%
    mutate_at(.vars = vars(starts_with("wr_")),
              .funs = ~ (-as.numeric(.)) + 2) %>%
    mutate_at(.vars = vars(wr_jbsec, stf_work),
              .funs = ~ ifelse(!(lm_part %in% c(
                "Active", "Atypical"
              )), NA_integer_, .)) %>%
    mutate_at(
      .vars = vars(cold_rent_sqm, cold_rent_load),
      .funs = ~ ifelse(owner == 1, NA_real_, .)
    ) %>%
    mutate_at(
      .vars = vars(
        starts_with("moved_"),
        starts_with("reason_for_moving"),
        social_housing
      ),
      .funs = ~ ifelse(. == -2, NA_integer_, .)
    )
}

## Recode wave-wise missing items back to missing
for (m in seq_along(soep_imp$imputations)) {
  for (v in 2:ncol(soep_niw)) {
    v_name <- names(soep_niw)[v]
    to_drop <- soep_niw$year[soep_niw[[v_name]] == 1]
    soep_imp$imputations[[m]][[v_name]][soep_imp$imputations[[m]][["year"]] %in% to_drop] <-
      NA
  }
}

## Save
save(soep_imp, file = "dat/proc-data/soep_imp.RData")
